{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 第三节：特征工程与基线模型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tqdm import tqdm  \n",
    "\n",
    "class _Data_Preprocess:\n",
    "    def __init__(self):\n",
    "        self.int8_max = np.iinfo(np.int8).max\n",
    "        self.int8_min = np.iinfo(np.int8).min\n",
    "\n",
    "        self.int16_max = np.iinfo(np.int16).max\n",
    "        self.int16_min = np.iinfo(np.int16).min\n",
    "\n",
    "        self.int32_max = np.iinfo(np.int32).max\n",
    "        self.int32_min = np.iinfo(np.int32).min\n",
    "\n",
    "        self.int64_max = np.iinfo(np.int64).max\n",
    "        self.int64_min = np.iinfo(np.int64).min\n",
    "\n",
    "        self.float16_max = np.finfo(np.float16).max\n",
    "        self.float16_min = np.finfo(np.float16).min\n",
    "\n",
    "        self.float32_max = np.finfo(np.float32).max\n",
    "        self.float32_min = np.finfo(np.float32).min\n",
    "\n",
    "        self.float64_max = np.finfo(np.float64).max\n",
    "        self.float64_min = np.finfo(np.float64).min\n",
    "\n",
    "    def _get_type(self, min_val, max_val, types):\n",
    "        if types == 'int':\n",
    "            if max_val <= self.int8_max and min_val >= self.int8_min:\n",
    "                return np.int8\n",
    "            elif max_val <= self.int16_max <= max_val and min_val >= self.int16_min:\n",
    "                return np.int16\n",
    "            elif max_val <= self.int32_max and min_val >= self.int32_min:\n",
    "                return np.int32\n",
    "            return None\n",
    "\n",
    "        elif types == 'float':\n",
    "            if max_val <= self.float16_max and min_val >= self.float16_min:\n",
    "                return np.float16\n",
    "            if max_val <= self.float32_max and min_val >= self.float32_min:\n",
    "                return np.float32\n",
    "            if max_val <= self.float64_max and min_val >= self.float64_min:\n",
    "                return np.float64\n",
    "            return None\n",
    "\n",
    "    def _memory_process(self, df):\n",
    "        init_memory = df.memory_usage().sum() / 1024 ** 2 / 1024\n",
    "        print('Original data occupies {} GB memory.'.format(init_memory))\n",
    "        df_cols = df.columns\n",
    "\n",
    "          \n",
    "        for col in tqdm_notebook(df_cols):\n",
    "            try:\n",
    "                if 'float' in str(df[col].dtypes):\n",
    "                    max_val = df[col].max()\n",
    "                    min_val = df[col].min()\n",
    "                    trans_types = self._get_type(min_val, max_val, 'float')\n",
    "                    if trans_types is not None:\n",
    "                        df[col] = df[col].astype(trans_types)\n",
    "                elif 'int' in str(df[col].dtypes):\n",
    "                    max_val = df[col].max()\n",
    "                    min_val = df[col].min()\n",
    "                    trans_types = self._get_type(min_val, max_val, 'int')\n",
    "                    if trans_types is not None:\n",
    "                        df[col] = df[col].astype(trans_types)\n",
    "            except:\n",
    "                print(' Can not do any process for column, {}.'.format(col)) \n",
    "        afterprocess_memory = df.memory_usage().sum() / 1024 ** 2 / 1024\n",
    "        print('After processing, the data occupies {} GB memory.'.format(afterprocess_memory))\n",
    "        return df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3.3 基线模型"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3.3.1 数据读取"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "import lightgbm as lgb\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import OneHotEncoder\n",
    "\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 数据读取"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "path  = '../security_data/'\n",
    "train = pd.read_csv(path + 'security_train.csv')\n",
    "test  = pd.read_csv(path + 'security_test.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>file_id</th>\n",
       "      <th>label</th>\n",
       "      <th>api</th>\n",
       "      <th>tid</th>\n",
       "      <th>index</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>LdrLoadDll</td>\n",
       "      <td>2488</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>LdrGetProcedureAddress</td>\n",
       "      <td>2488</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>LdrGetProcedureAddress</td>\n",
       "      <td>2488</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>LdrGetProcedureAddress</td>\n",
       "      <td>2488</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>LdrGetProcedureAddress</td>\n",
       "      <td>2488</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   file_id  label                     api   tid  index\n",
       "0        1      5              LdrLoadDll  2488      0\n",
       "1        1      5  LdrGetProcedureAddress  2488      1\n",
       "2        1      5  LdrGetProcedureAddress  2488      2\n",
       "3        1      5  LdrGetProcedureAddress  2488      3\n",
       "4        1      5  LdrGetProcedureAddress  2488      4"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3.3.2 特征工程 "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "def simple_sts_features(df):\n",
    "    simple_fea             = pd.DataFrame()\n",
    "    simple_fea['file_id']  = df['file_id'].unique()\n",
    "    simple_fea             = simple_fea.sort_values('file_id')\n",
    "     \n",
    "    df_grp = df.groupby('file_id')\n",
    "    simple_fea['file_id_api_count']   = df_grp['api'].count().values\n",
    "    simple_fea['file_id_api_nunique'] = df_grp['api'].nunique().values\n",
    "    \n",
    "    simple_fea['file_id_tid_count']   = df_grp['tid'].count().values\n",
    "    simple_fea['file_id_tid_nunique'] = df_grp['tid'].nunique().values\n",
    "    \n",
    "    simple_fea['file_id_index_count']   = df_grp['index'].count().values\n",
    "    simple_fea['file_id_index_nunique'] = df_grp['index'].nunique().values\n",
    "    \n",
    "    return simple_fea"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wall time: 1.4 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "simple_train_fea1 = simple_sts_features(train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wall time: 23.9 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "simple_test_fea1 = simple_sts_features(test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def simple_numerical_sts_features(df):\n",
    "    simple_numerical_fea             = pd.DataFrame()\n",
    "    simple_numerical_fea['file_id']  = df['file_id'].unique()\n",
    "    simple_numerical_fea             = simple_numerical_fea.sort_values('file_id')\n",
    "     \n",
    "    df_grp = df.groupby('file_id')\n",
    "    \n",
    "    simple_numerical_fea['file_id_tid_mean']  = df_grp['tid'].mean().values\n",
    "    simple_numerical_fea['file_id_tid_min']   = df_grp['tid'].min().values\n",
    "    simple_numerical_fea['file_id_tid_std']   = df_grp['tid'].std().values\n",
    "    simple_numerical_fea['file_id_tid_max']   = df_grp['tid'].max().values\n",
    "    \n",
    "    \n",
    "    simple_numerical_fea['file_id_index_mean']= df_grp['index'].mean().values\n",
    "    simple_numerical_fea['file_id_index_min'] = df_grp['index'].min().values\n",
    "    simple_numerical_fea['file_id_index_std'] = df_grp['index'].std().values\n",
    "    simple_numerical_fea['file_id_index_max'] = df_grp['index'].max().values\n",
    "    \n",
    "    return simple_numerical_fea"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wall time: 172 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "simple_train_fea2 = simple_numerical_sts_features(train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Wall time: 18 ms\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "simple_test_fea2 = simple_numerical_sts_features(test)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3.3.3 基线构建"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_label = train[['file_id','label']].drop_duplicates(subset = ['file_id','label'], keep = 'first')\n",
    "test_submit = test[['file_id']].drop_duplicates(subset = ['file_id'], keep = 'first')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "### 训练集&测试集构建\n",
    "train_data = train_label.merge(simple_train_fea1, on ='file_id', how='left')\n",
    "train_data = train_data.merge(simple_train_fea2, on ='file_id', how='left')\n",
    "\n",
    "test_submit = test_submit.merge(simple_test_fea1, on ='file_id', how='left')\n",
    "test_submit = test_submit.merge(simple_test_fea2, on ='file_id', how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "def lgb_logloss(preds,data):\n",
    "    labels_ = data.get_label()             \n",
    "    classes_ = np.unique(labels_) \n",
    "    preds_prob = []\n",
    "    for i in range(len(classes_)):\n",
    "        preds_prob.append(preds[i*len(labels_):(i+1) * len(labels_)] )\n",
    "        \n",
    "    preds_prob_ = np.vstack(preds_prob) \n",
    "    \n",
    "    loss = []\n",
    "    for i in range(preds_prob_.shape[1]):     # 样本个数\n",
    "        sum_ = 0\n",
    "        for j in range(preds_prob_.shape[0]): #类别个数\n",
    "            pred = preds_prob_[j,i]           # 第i个样本预测为第j类的概率\n",
    "            if  j == labels_[i]:\n",
    "                sum_ += np.log(pred)\n",
    "            else:\n",
    "                sum_ += np.log(1 - pred)\n",
    "        loss.append(sum_)       \n",
    "    return 'loss is: ',-1 * (np.sum(loss) / preds_prob_.shape[1]),False"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "### 模型验证\n",
    "train_features = [col for col in train_data.columns if col not in ['label','file_id']]\n",
    "train_label    = 'label'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "fold n°0\n",
      "Training until validation scores don't improve for 100 rounds\n",
      "[50]\ttraining's multi_logloss: 1.83717\ttraining's loss is: : 2.41456\tvalid_1's multi_logloss: 1.28536\tvalid_1's loss is: : 0.941228\n",
      "[100]\ttraining's multi_logloss: 1.83717\ttraining's loss is: : 2.41456\tvalid_1's multi_logloss: 1.28536\tvalid_1's loss is: : 0.941228\n",
      "Early stopping, best iteration is:\n",
      "[1]\ttraining's multi_logloss: 1.83717\ttraining's loss is: : 2.41456\tvalid_1's multi_logloss: 1.28536\tvalid_1's loss is: : 0.941228\n",
      "fold n°1\n",
      "Training until validation scores don't improve for 100 rounds\n",
      "[50]\ttraining's multi_logloss: 1.77226\ttraining's loss is: : 2.32838\tvalid_1's multi_logloss: 2.10695\tvalid_1's loss is: : 1.86109\n",
      "[100]\ttraining's multi_logloss: 1.77226\ttraining's loss is: : 2.32838\tvalid_1's multi_logloss: 2.10695\tvalid_1's loss is: : 1.86109\n",
      "Early stopping, best iteration is:\n",
      "[1]\ttraining's multi_logloss: 1.77226\ttraining's loss is: : 2.32838\tvalid_1's multi_logloss: 2.10695\tvalid_1's loss is: : 1.86109\n",
      "fold n°2\n",
      "Training until validation scores don't improve for 100 rounds\n",
      "[50]\ttraining's multi_logloss: 1.79063\ttraining's loss is: : 2.32093\tvalid_1's multi_logloss: 1.67573\tvalid_1's loss is: : 1.91268\n",
      "[100]\ttraining's multi_logloss: 1.79063\ttraining's loss is: : 2.32093\tvalid_1's multi_logloss: 1.67573\tvalid_1's loss is: : 1.91268\n",
      "Early stopping, best iteration is:\n",
      "[1]\ttraining's multi_logloss: 1.79063\ttraining's loss is: : 2.32093\tvalid_1's multi_logloss: 1.67573\tvalid_1's loss is: : 1.91268\n",
      "fold n°3\n",
      "Training until validation scores don't improve for 100 rounds\n",
      "[50]\ttraining's multi_logloss: 1.79651\ttraining's loss is: : 2.36572\tvalid_1's multi_logloss: 1.92355\tvalid_1's loss is: : 1.42824\n",
      "[100]\ttraining's multi_logloss: 1.79651\ttraining's loss is: : 2.36572\tvalid_1's multi_logloss: 1.92355\tvalid_1's loss is: : 1.42824\n",
      "Early stopping, best iteration is:\n",
      "[1]\ttraining's multi_logloss: 1.79651\ttraining's loss is: : 2.36572\tvalid_1's multi_logloss: 1.92355\tvalid_1's loss is: : 1.42824\n",
      "fold n°4\n",
      "Training until validation scores don't improve for 100 rounds\n",
      "[50]\ttraining's multi_logloss: 1.70379\ttraining's loss is: : 2.27265\tvalid_1's multi_logloss: 2.91788\tvalid_1's loss is: : 3.32694\n",
      "[100]\ttraining's multi_logloss: 1.70379\ttraining's loss is: : 2.27265\tvalid_1's multi_logloss: 2.91788\tvalid_1's loss is: : 3.32694\n",
      "Early stopping, best iteration is:\n",
      "[1]\ttraining's multi_logloss: 1.70379\ttraining's loss is: : 2.27265\tvalid_1's multi_logloss: 2.91788\tvalid_1's loss is: : 3.32694\n",
      "Wall time: 9.94 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "from sklearn.model_selection import StratifiedKFold,KFold\n",
    "params = {\n",
    "        'task':'train', \n",
    "        'num_leaves': 255,\n",
    "        'objective': 'multiclass',\n",
    "        'num_class': 8,\n",
    "        'min_data_in_leaf': 50,\n",
    "        'learning_rate': 0.05,\n",
    "        'feature_fraction': 0.85,\n",
    "        'bagging_fraction': 0.85,\n",
    "        'bagging_freq': 5, \n",
    "        'max_bin':128,\n",
    "        'random_state':100\n",
    "    }   \n",
    "\n",
    "folds = KFold(n_splits=5, shuffle=True, random_state=15)\n",
    "oof = np.zeros(len(train))\n",
    "\n",
    "predict_res = 0\n",
    "models = []\n",
    "for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_data)):\n",
    "    print(\"fold n°{}\".format(fold_))\n",
    "    trn_data = lgb.Dataset(train_data.iloc[trn_idx][train_features], label=train_data.iloc[trn_idx][train_label].values)\n",
    "    val_data = lgb.Dataset(train_data.iloc[val_idx][train_features], label=train_data.iloc[val_idx][train_label].values) \n",
    "    \n",
    "    clf = lgb.train(params, trn_data, num_boost_round=2000,valid_sets=[trn_data,val_data], verbose_eval=50, early_stopping_rounds=100, feval=lgb_logloss) \n",
    "    models.append(clf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "fold n°0\n",
      "Training until validation scores don't improve for 100 rounds\n",
      "[50]\ttraining's multi_logloss: 1.83717\ttraining's loss is: : 2.41456\tvalid_1's multi_logloss: 1.28536\tvalid_1's loss is: : 0.941228\n",
      "[100]\ttraining's multi_logloss: 1.83717\ttraining's loss is: : 2.41456\tvalid_1's multi_logloss: 1.28536\tvalid_1's loss is: : 0.941228\n",
      "Early stopping, best iteration is:\n",
      "[1]\ttraining's multi_logloss: 1.83717\ttraining's loss is: : 2.41456\tvalid_1's multi_logloss: 1.28536\tvalid_1's loss is: : 0.941228\n",
      "fold n°1\n",
      "Training until validation scores don't improve for 100 rounds\n",
      "[50]\ttraining's multi_logloss: 1.77226\ttraining's loss is: : 2.32838\tvalid_1's multi_logloss: 2.10695\tvalid_1's loss is: : 1.86109\n",
      "[100]\ttraining's multi_logloss: 1.77226\ttraining's loss is: : 2.32838\tvalid_1's multi_logloss: 2.10695\tvalid_1's loss is: : 1.86109\n",
      "Early stopping, best iteration is:\n",
      "[1]\ttraining's multi_logloss: 1.77226\ttraining's loss is: : 2.32838\tvalid_1's multi_logloss: 2.10695\tvalid_1's loss is: : 1.86109\n",
      "fold n°2\n",
      "Training until validation scores don't improve for 100 rounds\n",
      "[50]\ttraining's multi_logloss: 1.79063\ttraining's loss is: : 2.32093\tvalid_1's multi_logloss: 1.67573\tvalid_1's loss is: : 1.91268\n",
      "[100]\ttraining's multi_logloss: 1.79063\ttraining's loss is: : 2.32093\tvalid_1's multi_logloss: 1.67573\tvalid_1's loss is: : 1.91268\n",
      "Early stopping, best iteration is:\n",
      "[1]\ttraining's multi_logloss: 1.79063\ttraining's loss is: : 2.32093\tvalid_1's multi_logloss: 1.67573\tvalid_1's loss is: : 1.91268\n",
      "fold n°3\n",
      "Training until validation scores don't improve for 100 rounds\n",
      "[50]\ttraining's multi_logloss: 1.79651\ttraining's loss is: : 2.36572\tvalid_1's multi_logloss: 1.92355\tvalid_1's loss is: : 1.42824\n",
      "[100]\ttraining's multi_logloss: 1.79651\ttraining's loss is: : 2.36572\tvalid_1's multi_logloss: 1.92355\tvalid_1's loss is: : 1.42824\n",
      "Early stopping, best iteration is:\n",
      "[1]\ttraining's multi_logloss: 1.79651\ttraining's loss is: : 2.36572\tvalid_1's multi_logloss: 1.92355\tvalid_1's loss is: : 1.42824\n",
      "fold n°4\n",
      "Training until validation scores don't improve for 100 rounds\n",
      "[50]\ttraining's multi_logloss: 1.70379\ttraining's loss is: : 2.27265\tvalid_1's multi_logloss: 2.91788\tvalid_1's loss is: : 3.32694\n",
      "[100]\ttraining's multi_logloss: 1.70379\ttraining's loss is: : 2.27265\tvalid_1's multi_logloss: 2.91788\tvalid_1's loss is: : 3.32694\n",
      "Early stopping, best iteration is:\n",
      "[1]\ttraining's multi_logloss: 1.70379\ttraining's loss is: : 2.27265\tvalid_1's multi_logloss: 2.91788\tvalid_1's loss is: : 3.32694\n",
      "Wall time: 9.7 s\n"
     ]
    }
   ],
   "source": [
    "%%time\n",
    "from sklearn.model_selection import StratifiedKFold,KFold\n",
    "params = {\n",
    "        'task':'train', \n",
    "        'num_leaves': 255,\n",
    "        'objective': 'multiclass',\n",
    "        'num_class': 8,\n",
    "        'min_data_in_leaf': 50,\n",
    "        'learning_rate': 0.05,\n",
    "        'feature_fraction': 0.85,\n",
    "        'bagging_fraction': 0.85,\n",
    "        'bagging_freq': 5, \n",
    "        'max_bin':128,\n",
    "        'random_state':100\n",
    "    }   \n",
    "\n",
    "folds = KFold(n_splits=5, shuffle=True, random_state=15)\n",
    "oof = np.zeros(len(train))\n",
    "\n",
    "predict_res = 0\n",
    "models = []\n",
    "for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_data)):\n",
    "    print(\"fold n°{}\".format(fold_))\n",
    "    trn_data = lgb.Dataset(train_data.iloc[trn_idx][train_features], label=train_data.iloc[trn_idx][train_label].values)\n",
    "    val_data = lgb.Dataset(train_data.iloc[val_idx][train_features], label=train_data.iloc[val_idx][train_label].values) \n",
    "    \n",
    "    clf = lgb.train(params, trn_data, num_boost_round=2000,valid_sets=[trn_data,val_data], verbose_eval=50, early_stopping_rounds=100, feval=lgb_logloss) \n",
    "    models.append(clf)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3.3.4 特征重要性分析"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "feature_importance             = pd.DataFrame()\n",
    "feature_importance['fea_name'] = train_features\n",
    "feature_importance['fea_imp']  = clf.feature_importance()\n",
    "feature_importance             = feature_importance.sort_values('fea_imp',ascending = False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0x23a32b5c3c8>"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAABKMAAAJOCAYAAABr8MR3AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAgAElEQVR4nOzde7htZV0v8O8PUMRUEPCCgG5MyvTJKFeYprXLS1opmpxETbE0K/WYmqcsKxWrx0tlxzCVlCKOx0uWRmreUNK8IAvdctEURDwQpBhIIqmB7/ljjMWeLNZl7rX2eufea38+z7OeNeYY7xjjnb/1jjnn+s4x5qzWWgAAAACgh71m3QEAAAAA9hzCKAAAAAC6EUYBAAAA0I0wCgAAAIBuhFEAAAAAdLPPrDvQ28EHH9y2bNky624AAAAAbBpnn332V1trt5um7R4XRm3ZsiXz8/Oz7gYAAADAplFVX5q2rcv0AAAAAOhGGAUAAABAN8IoAAAAALoRRgEAAADQjTAKAAAAgG6EUQAAAAB0I4wCAAAAoBthFAAAAADdCKMAAAAA6EYYBQAAAEA3wigAAAAAuhFGAQAAANCNMAoAAACAboRRAAAAAHQjjAIAAACgG2EUAAAAAN0IowAAAADoRhgFAAAAQDfCKAAAAAC6EUYBAAAA0I0wCgAAAIBuhFEAAAAAdCOMAgAAAKAbYRQAAAAA3QijAAAAAOhGGAUAAABAN8IoAAAAALoRRgEAAADQjTAKAAAAgG6EUQAAAAB0I4wCAAAAoBthFAAAAADdCKMAAAAA6EYYBQAAAEA3wigAAAAAuhFGAQAAANCNMAoAAACAboRRAAAAAHQjjAIAAACgG2EUAAAAAN0IowAAAADoRhgFAAAAQDfCKAAAAAC6EUYBAAAA0I0wCgAAAIBuhFEAAAAAdCOMAgAAAKAbYRQAAAAA3QijAAAAAOhGGAUAAABAN8IoAAAAALoRRgEAAADQjTAKAAAAgG6EUQAAAAB0I4wCAAAAoBthFAAAAADdCKMAAAAA6EYYBQAAAEA3wigAAAAAuhFGAQAAANCNMAoAAACAboRRAAAAAHQjjAIAAACgG2EUAAAAAN0IowAAAADoRhgFAAAAQDfCKAAAAAC6EUYBAAAA0I0wCgAAAIBuZh5GVdVDq+pzVXVhVT1vieX7VtWbx+VnVtWWRcvvXFXXVNVze/UZAAAAgLWZaRhVVXsneVWShyW5R5LHVtU9FjV7cpKrWmt3S/KKJC9dtPwVSf5po/sKAAAAwPrN+syoo5Nc2Fq7qLX27SRvSnLMojbHJDllnH5rkgdWVSVJVT0yyUVJzu/UXwAAAADWYdZh1KFJLpm4fek4b8k2rbXrklyd5KCq+q4kv5XkRR36CQAAAMBOMOswqpaY16Zs86Ikr2itXbPqTqqeWlXzVTV/xRVXrKGbAAAAAOwM+8x4/5cmOXzi9mFJLlumzaVVtU+S/ZNcmeQ+SY6tqpclOSDJd6rqm621ExfvpLV2UpKTkmRubm5x2AUAAABAJ7MOo85KcmRVHZHk35Icl+Rxi9qcluT4JB9LcmySD7TWWpIHLDSoqhcmuWapIAoAAACAXcdMw6jW2nVV9Ywk70myd5KTW2vnV9UJSeZba6cleX2SU6vqwgxnRB03ux4DAAAAsB41nGS055ibm2vz8/Oz7gYAAADAplFVZ7fW5qZpO+sPMAcAAABgDyKMAgAAAKAbYRQAAAAA3QijAAAAAOhGGAUAAABAN8IoAAAAALoRRgEAAADQjTAKAAAAgG6EUQAAAAB0I4wCAAAAoBthFAAAAADdCKMAAAAA6EYYBQAAAEA3wigAAAAAuhFGAQAAANCNMAoAAACAboRRAAAAAHQjjAIAAACgG2EUAAAAAN0IowAAAADoRhgFAAAAQDfCKAAAAAC6EUYBAAAA0I0wCgAAAIBuhFEAAAAAdCOMAgAAAKAb
YRQAAAAA3QijAAAAAOhGGAUAAABAN8IoAAAAALoRRgEAAADQjTAKAAAAgG6EUQAAAAB0I4wCAAAAoBthFAAAAADdCKMAAAAA6EYYBQAAAEA3wigAAAAAuhFGAQAAANCNMAoAAACAboRRAAAAAHQjjAIAAACgG2EUAAAAAN0IowAAAADoRhgFAAAAQDfCKAAAAAC6EUYBAAAA0I0wCgAAAIBuhFEAAAAAdCOMAgAAAKAbYRQAAAAA3QijAAAAAOhGGAUAAABAN8IoAAAAALoRRgEAAADQjTAKAAAAgG6EUQAAAAB0I4wCAAAAoBthFAAAAADdCKMAAAAA6EYYBQAAAEA3wigAAAAAuhFGAQAAANCNMAoAAACAboRRAAAAAHQjjAIAAACgG2EUAAAAAN0IowAAAADoRhgFAAAAQDfCKAAAAAC6EUYBAAAA0I0wCgAAAIBuhFEAAAAAdCOMAgAAAKAbYRQAAAAA3QijAAAAAOhGGAUAAABAN8IoAAAAALoRRgEAAADQjTAKAAAAgG6EUQAAAAB0I4wCAAAAoBthFAAAAADdCKMAAAAA6EYYBQAAAEA3wigAAAAAuhFGAQAAANCNMAoAAACAboRRAAAAAHQjjAIAAACgm5mHUVX10Kr6XFVdWFXPW2L5vlX15nH5mVW1ZZz/4Ko6u6rOHX//ZO++AwAAALBjZhpGVdXeSV6V5GFJ7pHksVV1j0XNnpzkqtba3ZK8IslLx/lfTfLw1tr3Jzk+yal9eg0AAADAWs36zKijk1zYWruotfbtJG9KcsyiNsckOWWcfmuSB1ZVtdY+1Vq7bJx/fpJbVNW+XXoNAAAAwJrMOow6NMklE7cvHect2aa1dl2Sq5MctKjNo5N8qrX2raV2UlVPrar5qpq/4oordkrHAQAAANhxsw6jaol5bUfaVNU9M1y69yvL7aS1dlJrba61Nne7291uTR0FAAAAYP1mHUZdmuTwiduHJblsuTZVtU+S/ZNcOd4+LMnbkjyxtfaFDe8tAAAAAOsy6zDqrCRHVtURVXXzJMclOW1Rm9MyfEB5khyb5AOttVZVByR5Z5Lfbq19pFuPAQAAAFizmYZR42dAPSPJe5J8NslbWmvnV9UJVfWIsdnrkxxUVRcmeU6S543zn5Hkbkl+r6q2jT+373wXAAAAANgB1drij2ja3Obm5tr8/PysuwEAAACwaVTV2a21uWnazvoyPQAAAAD2IMIoAAAAALoRRgEAAADQjTAKAAAAgG6EUQAAAAB0I4wCAAAAoBthFAAAAADdCKMAAAAA6EYYBQAAAEA3wigAAAAAuhFGAQAAANCNMAoAAACAboRRAAAAAHQjjAIAAACgG2EUAAAAAN0IowAAAADoRhgFAAAAQDfCKAAAAAC6EUYBAAAA0I0wCgAAAIBuhFEAAAAAdCOMAgAAAKAbYRQAAAAA3QijAAAAAOhGGAUAAABAN8IoAAAAALoRRgEAAADQjTAKAAAAgG6EUQAAAAB0I4wCAAAAoBthFAAAAADdCKMAAAAA6EYYBQAAAEA3wigAAAAAuhFGAQAAANCNMAoAAACAboRRAAAAAHQjjAIAAACgG2EUAAAAAN0IowAAAADoRhgFAAAAQDfCKAAAAAC6EUYBAAAA0I0wCgAAAIBuhFEAAAAAdCOMAgAAAKAbYRQAAAAA3QijAAAAAOhGGAUAAABAN8IoAAAAALoRRgEAAADQjTAKAAAAgG6EUQAAAAB0I4wCAAAAoBthFAAAAADdCKMAAAAA6EYYBQAAAEA3wigAAAAAuhFGAQAAANCNMAoAAACAboRRAAAAAHQjjAIAAACgG2EUAAAAAN0IowAAAADoRhgFAAAAQDfCKAAAAAC6EUYBAAAA0I0wCgAAAIBuhFEAAAAAdCOMAgAAAKAbYRQAAAAA3QijAAAAAOhGGAUAAABAN8IoAAAAALoRRgEAAADQjTAKAAAAgG52KIyqqttU1a03qjMAAAAAbG5ThVFVNVdV5yY5
J8l5VfXpqrr3xnYNAAAAgM1mnynbnZzkaa21DydJVd0/yV8luddGdQwAAACAzWfay/S+vhBEJUlr7V+SfH1jugQAAADAZjXtmVGfqKrXJnljkpbkMUnOqKofSpLW2ic3qH8AAAAAbCLThlFHjb9fsGj+/TKEUz+503oEAAAAwKY1VRjVWvuJje4IAAAAAJvfVGFUVR2Q5IlJtkyu01p75sZ0CwAAAIDNaNrL9N6V5ONJzk3ynY3rDgAAAACb2bRh1C1aa8/Z0J4AAAAAsOntNWW7U6vql6vqkKo6cOFnQ3sGAAAAwKYz7ZlR307y8iTPz/DteRl/33UjOgUAAADA5jRtGPWcJHdrrX11IzsDAAAAwOY27WV65ye5diM7AgAAAMDmN20YdX2SbVX12qp65cLPzuhAVT20qj5XVRdW1fOWWL5vVb15XH5mVW2ZWPbb4/zPVdVP7Yz+AAAAALBxpr1M7+3jz05VVXsneVWSBye5NMlZVXVaa+0zE82enOSq1trdquq4JC9N8piqukeS45LcM8mdkry/qr6ntXb9zu4nAAAAADvHVGFUa+2UDdr/0UkubK1dlCRV9aYkxySZDKOOSfLCcfqtSU6sqhrnv6m19q0kX6yqC8ftfWyD+goAAADAOq0YRlXVW1prP19V52b7t+jdoLV2r3Xu/9Akl0zcvjTJfZZr01q7rqquTnLQOP/ji9Y9dKmdVNVTkzw1Se585zuvs8sAAAAArNVqZ0b9+vj7Zzdo/7XEvMWh13Jtpll3mNnaSUlOSpK5ubkl2wAAAACw8VYMo1prl4+/v7RSu6r6WGvtvmvY/6VJDp+4fViSy5Zpc2lV7ZNk/yRXTrkuAAAAALuQab9NbzW3WON6ZyU5sqqOqKqbZ/hA8tMWtTktyfHj9LFJPtBaa+P848Zv2zsiyZFJPrHGfgAAAADQwbTfpreaNV36Nn4G1DOSvCfJ3klObq2dX1UnJJlvrZ2W5PVJTh0/oPzKDIFVxnZvyfBh59clebpv0gMAAADYtdVwktE6N1L1ydbaD+2E/my4ubm5Nj8/P+tuAAAAAGwaVXV2a21umrY76zK9pT5MHAAAAABuZGeFUU/YSdsBAAAAYBObKoyqqh+pqrOq6pqq+nZVXV9V/7mwvLV23sZ1EQAAAIDNYtozo05M8tgkFyTZL8lTkvz5RnUKAAAAgM1p6m/Ta61dWFV7j99Y91dV9dEN7BcAAAAAm9C0YdS1VXXzJNuq6mVJLk/yXRvXLQAAAAA2o2kv03vC2PYZSb6R5PAkj96oTgEAAACwOU11ZlRr7UtVtV+SQ1prL9rgPgEAAACwSU37bXoPT7ItybvH20dV1Wkb2TEAAAAANp9pL9N7YZKjk3wtSVpr25Js2ZguAQAAALBZTRtGXddau3pDewIAAADApjftt+mdV1WPS7J3VR2Z5JlJPrpx3QIAAABgM1rxzKiqOnWc/EKSeyb5VpI3JvnPJM/a2K4BAAAAsNmsdmbUvavqLkkek+QnkvzJxLJbJvnmRnUMAAAAgM1ntTDqNRm+Qe+uSeYn5leSNs4HAAAAgKmseJlea+2VrbXvS3Jya+2uEz9HtNYEUQAAAADskKm+Ta+19msb3REAAAAANr+pwigAAAAA2BmEUQAAAAB0I4wCAAAAoBthFAAAAADdCKMAAAAA6EYYBQAAAEA3wigAAAAAuhFGAQAAANCNMAoAAACAboRRAAAAAHQjjAIAAACgG2EUAAAAAN0IowAAAADoRhgFAAAAQDfCKAAAAAC6EUYBAAAA0I0wCgAAAIBuhFEAAAAAdCOMAgAAAKAbYRQAAAAA3QijAAAAAOhGGAUAAABAN8IoAAAAALoRRgEAAADQjTAKAAAAgG6EUQAAAAB0I4wCAAAAoBthFAAAAADdCKMAAAAA6EYYBQAAAEA3wigAAAAAuhFGAQAAANCNMAoAAACAboRRAAAAAHQjjAIAAACgG2EUAAAAAN0IowAAAADoRhgFAAAA
QDfCKAAAAAC6EUYBAAAA0I0wCgAAAIBuhFEAAAAAdCOMAgAAAKAbYRQAAAAA3QijAAAAAOhGGAUAAABAN8IoAAAAALoRRgEAAADQjTAKAAAAgG6EUQAAAAB0I4wCAAAAoBthFAAAAADdCKMAAAAA6EYYBQAAAEA3wigAAAAAuhFGAQAAANCNMAoAAACAboRRAAAAAHQjjAIAAACgG2EUAAAAAN0IowAAAADoRhgFAAAAQDfCKAAAAAC6EUYBAAAA0I0wCgAAAIBuhFEAAAAAdCOMAgAAAKAbYRQAAAAA3QijAAAAAOhGGAUAAABAN8IoAAAAALoRRgEAAADQjTAKAAAAgG5mFkZV1YFV9b6qumD8fdtl2h0/trmgqo4f592yqt5ZVf9aVedX1Uv69h4AAACAtZjlmVHPS3J6a+3IJKePt2+kqg5M8oIk90lydJIXTIRWf9xau3uSH0zyo1X1sD7dBgAAAGCtZhlGHZPklHH6lCSPXKLNTyV5X2vtytbaVUnel+ShrbVrW2sfTJLW2reTfDLJYR36DAAAAMA6zDKMukNr7fIkGX/ffok2hya5ZOL2peO8G1TVAUkenuHsqiVV1VOrar6q5q+44op1dxwAAACAtdlnIzdeVe9PcsclFj1/2k0sMa9NbH+fJG9M8srW2kXLbaS1dlKSk5Jkbm6uLdcOAAAAgI21oWFUa+1Byy2rqi9X1SGttcur6pAkX1mi2aVJtk7cPizJGRO3T0pyQWvtz3ZCdwEAAADYYLO8TO+0JMeP08cn+Ycl2rwnyUOq6rbjB5c/ZJyXqvqDJPsneVaHvgIAAACwE8wyjHpJkgdX1QVJHjzeTlXNVdXrkqS1dmWSFyc5a/w5obV2ZVUdluFSv3sk+WRVbauqp8ziTgAAAAAwvWptz/oIpbm5uTY/Pz/rbgAAAABsGlV1dmttbpq2szwzCgAAAIA9jDAKAAAAgG6EUQAAAAB0I4wCAAAAoBthFAAAAADdCKMAAAAA6EYYBQAAAEA3wigAAAAAuhFGAQAAANCNMAoAAACAboRRAAAAAHQjjAIAAACgG2EUAAAAAN0IowAAAADoRhgFAAAAQDfCKAAAAAC6EUYBAAAA0I0wCgAAAIBuhFEAAAAAdCOMAgAAAKAbYRQAAAAA3QijAAAAAOhGGAUAAABAN8IoAAAAALoRRgEAAADQjTAKAAAAgG6EUQAAAAB0I4wCAAAAoBthFAAAAADdCKMAAAAA6EYYBQAAAEA3wigAAAAAuhFGAQAAANCNMAoAAACAboRRAAAAAHQjjAIAAACgG2EUAAAAAN0IowAAAADoRhgFAAAAQDfCKAAAAAC6EUYBAAAA0I0wCgAAAIBuhFEAAAAAdCOMAgAAAKAbYRQAAAAA3QijAAAAAOhGGAUAAABAN8IoAAAAALoRRgEAAADQjTAKAAAAgG6EUQAAAAB0I4wCAAAAoBthFAAAAADdCKMAAAAA6EYYBQAAAEA3wigAAAAAuhFGAQAAANCNMAoAAACAboRRAAAAAHQjjAIAAACgG2EUAAAAAN0IowAAAADoRhgFAAAAQDfCKAAAAAC6EUYBAAAA0I0wCgAAAIBuhFEAAAAAdCOMAgAAAKAbYRQAAAAA3QijAAAAAOhGGAUAAABAN8IoAAAAALoRRgEAAADQjTAKAAAAgG6EUQAAAAB0I4wCAAAAoBthFAAAAADdCKMAAAAA6EYYBQAAAEA3wigAAAAAuhFGAQAAANCNMAoAAACAboRRAAAAAHQjjAIAAACgG2EUAAAAAN0IowAAAADoRhgFAAAAQDfCKAAAAAC6EUYBAAAA0I0wCgAAAIBuZhZGVdWBVfW+qrpg/H3bZdodP7a5oKqOX2L5aVV13sb3GAAAAID1muWZUc9Lcnpr7cgkp4+3b6SqDkzygiT3SXJ0khdMhlZV9XNJrunTXQAAAADWa5Zh1DFJThmnT0nyyCXa/FSS97XWrmytXZXkfUkemiRVdaskz0nyBx36
CgAAAMBOMMsw6g6ttcuTZPx9+yXaHJrkkonbl47zkuTFSf4kybWr7aiqnlpV81U1f8UVV6yv1wAAAACs2T4bufGqen+SOy6x6PnTbmKJea2qjkpyt9bas6tqy2obaa2dlOSkJJmbm2tT7hsAAACAnWxDw6jW2oOWW1ZVX66qQ1prl1fVIUm+skSzS5Nsnbh9WJIzktw3yb2r6uIM9+H2VXVGa21rAAAAANhlzfIyvdOSLHw73vFJ/mGJNu9J8pCquu34weUPSfKe1tqrW2t3aq1tSXL/JJ8XRAEAAADs+mYZRr0kyYOr6oIkDx5vp6rmqup1SdJauzLDZ0OdNf6cMM4DAAAAYDdUre1ZH6E0NzfX5ufnZ90NAAAAgE2jqs5urc1N03aWZ0YBAAAAsIcRRgEAAADQjTAKAAAAgG6EUQAAAAB0I4wCAAAAoBthFAAAAADdCKMAAAAA6EYYBQAAAEA3wigAAAAAuhFGAQAAANCNMAoAAACAboRRAAAAAHQjjAIAAACgG2EUAAAAAN0IowAAAADoRhgFAAAAQDfCKAAAAAC6EUYBAAAA0I0wCgAAAIBuhFEAAAAAdCOMAgAAAKAbYRQAAAAA3QijAAAAAOhGGAUAAABAN8IoAAAAALoRRgEAAADQjTAKAAAAgG6EUQAAAAB0I4wCAAAAoBthFAAAAADdCKMAAAAA6EYYBQAAAEA3wigAAAAAuhFGAQAAANCNMAoAAACAboRRAAAAAHQjjAIAAACgG2EUAAAAAN0IowAAAADoRhgFAAAAQDfCKAAAAAC6EUYBAAAA0I0wCgAAAIBuhFEAAAAAdCOMAgAAAKAbYRQAAAAA3QijAAAAAOhGGAUAAABAN8IoAAAAALoRRgEAAADQjTAKAAAAgG6EUQAAAAB0I4wCAAAAoBthFAAAAADdCKMAAAAA6EYYBQAAAEA3wigAAAAAuhFGAQAAANCNMAoAAACAboRRAAAAAHQjjAIAAACgG2EUAAAAAN0IowAAAADoRhgFAAAAQDfCKAAAAAC6EUYBAAAA0I0wCgAAAIBuhFEAAAAAdFOttVn3oauquiLJl2bdj2UcnOSrs+7Ebkz91kf91kf91k7t1kf91kf91k7t1kf91kf91kf91k7t1kf91mdXr99dWmu3m6bhHhdG7cqqar61Njfrfuyu1G991G991G/t1G591G991G/t1G591G991G991G/t1G591G99NlP9XKYHAAAAQDfCKAAAAAC6EUbtWk6adQd2c+q3Puq3Puq3dmq3Puq3Puq3dmq3Puq3Puq3Puq3dmq3Puq3Ppumfj4zCgAAAIBunBkFAAAAQDfCKAAAAAC6EUYBAAAA0M0eFUZV1TOr6rNVdVVVPW+c98Kqeu4atvWrVfXEJeZvqarzdkZ/J7b5rqo6YGduc5X93ahOVfWsqvpDdVr/vnaFMVhVR1XVT0/cfsRCX5Zoe82O9mu9FtXo3eM8NRr29dFVlu8K42uuql65g/taUx/XYmfUqKq2VtX9NmuNdtRydZhYviuMy13ymF6rPbGmq42zNWxv09ewqn5nhWU7fF/3hJrtqF2kJpvmOUU9d8xyrwvVccV9bK2q+y0xf6FmTc362WfWHejsaUke1lr74no31Fp7zU7oz7T7+unVW+1UN6pTVV2c5C1r2dBmrdM69rUrjMGjkswlede4ndOSnLbe/uxEN9RovS8kN1uNWms3efJcZObjq7U2n2R+vfvfQDujRluTXNNa++O1rLwb1GiHTDFWZj4us4se0+uwx9V0A15P7Ak1/J0kf7QTt7cn1GxHzbwmm+w5RT13wAqvC9VxeVuTXJNkcZD3tCQPS3Jua+0l69nBJqzZhtljzoyqqtckuWuS06rq2VV14hJtvruq3l1VZ1fVh6vq7its74YUsqruXVWfrqqPJXn6Kv3YMm77k+PP/cb5W6vqQ1X1tqr6TFW9pqr2GpddXFUHr7DNJ1bVOWMfTh3n3aWqTh/nn15Vdx7n
/3VVHTux7jUT+z+jqr6Q5HuTfGqs04eS3CnJ8Ul+dXet07i9z1bVX1bV+VX13qrab1x2RlXNjdMHj+FbqupJVfX34329oKpeNrG9G/ZVVc+vqs9V1fur6o0T93fxdr+e7WPwA1X15ao6J8m9J7a7obWtqpsnOSHJY6pqW1U9ZryfJ47Lj6iqj1XVWVX14pX+RmP736yqc8d9v2Scd1RVfXwce2+rqtvuQJ2/luTIsUbvT3LLqtqW5Od2xxpNHFdvrap/rao3VFWNyybH0FxVnTHR55PH9S6qqmdObG/heK2qOnE8Bt5Zw5l6780wvj5XVb87Lp9L8qRxne+qqr+tqqur6trxONiIY3drVb1jivtyw3GT4TFnYf6Sf9+q+oca32Wqql+pqjes0Ie71XA8frqGx4/vHmu2Lcn3JPlsVZ0y1mhrksdNrPs345g+u6q+Obb55Djv7lW1JcNj4bOr6t+r6lW7aY3OqKqXVtUnqurzVfWAcf4NY328/Y6xRqmqa2o4S/bTNRzjd5i4DzcZK1X18qq6MsO4/NDY14VtPy7Jd4/rHF/Du7fXVtUVVfVDK/R7Vzim/7mq3jLW7SVV9fixjudW1cJ9ul1V/d24zbOq6kfH+UdX1Uer6lPj7++dqPuSzzdL9GHmr2dmVNPJfi45fqe1CWt4SA2vjbZV1XlV9YAanpP3G+e9YWy35GPKNDZhzaYddw+vqjPHY/b9tf1x75VVddZYkw9U1RdmUZOJ+7IrPKe8YhyHn62qH67hMe2CqvqDiXa/MNZ4W1W9tqr2Hue/uqq+kuE5+uMLY6yG14pbkzxr/Ls8WD1vVM/F/8e9taquyvbX0rP8n3dXqOMza3itfE5Vvalu/BpuWw2PlUdU1b9nGHtnJrnZHl6zacfe28f9nF9VTx3n3WVsd3BV7TX24SEr3ee01vaYnyQXJzk4wz9nJ47zXpjkueP06UmOHKfvk+QDK2xrcr1zkvz4OP3yJOetsN4tk9xinD4yyfw4vTXJNzM8qe2d5H1Jjp3s9zLbu2eSzy0sT3Lg+Psfkxw/Tv9SkreP03+9sN3x9jUT+786yWHj/s5K8odJThxvv2w3r9OWJNclOWq8/ZYkvzBOn5Fkbpw+OMnF4/STklyUZLF1M68AAA5bSURBVP8kt0jypSSHLxpL905y7tjf2yS5cOL+3mS7489zkvzdWNt9k1yW5I861vZJGcf/4tsZ3l184jj99IXxscx2HpbhXYVbLhp7k305Icmf7WCdr0tyr3H+t2d0nO6sGm3N9uNqryQfS3L/xeM1w7u8Z0z0+aPj2Dg4yX8kudmi4/XnMoz9vTOExV9Lcuy4zf+X5BkZxtfcOO+5Gd4dPz/D8XTA2O6MDajd1iTvWOm+ZOXjZsm/b5I7jO0ekOTzGcfbMn04M8mjxulbjPt59Fizi5N839iXk8f+fn5i/5ck+a1x+rIknx+nn5bkdZO12c1rdEaSPxmnfzrJ+5cZ++9IsnWcbkkePk6/LMnvTjNWxpo/I8n7s/04+nySV4/3+aok9xrnn5jkol38mP5akkPGv9m/JXnRuOzXs/3x7v9m+7F+5ySfHadvk2SfcfpBSf5umcfBG55vlunHxZn965neNZ3s5xlZYvzuyM8mq+FvJHn+OL13kluP09dMtFn2MWUPrdm04+62SWqcfsrEuLtlhufUfx9r+b9mWJOt2TWeU146Ub/LJmp7aZKDMjz3/mO2v6b5i4m/18JryIuTfCTJ72b7/yDvyvCc+7Tx76SeyUGTx3hu+nrzm0l+JrM9VneFOl6WZN9x+oDF92fycSPDWPvNsXZ7cs3OyHRjb+GY3S/Da72F+U9J8tYMj4mvXW4/Cz972mV6y6qqWyW5X5K/reHEhWQo+mrr7Z9hcP/zOOvUDP+kL+dmSU6sqqOSXJ8hhV3widbaReN235jk/hn+mCv5ySRvba19NUlaa1eO8++b7WeTnJrhH4fVfKK1dul4/8/LMPD/e7LBblyn
JPlia23bOH12hoBqNae31q4e9/WZJHfJ8M/qggckeVtr7dqxzTSnim9NcnSGF4z3z3AQH9yxtiv50Qz/tC9s56UrtH1Qkr9auO+ttSuX6MspSf52iv1O1vnbSQ5fqtFuWKNkPK7GfmzLMO7+ZZV13tla+1aSb9XwTuEdMjwBLPixJG9srV2f5LKq+sAU/X5ohheC54y3986i43spO6F2S92XJY+blf6+rbUvV9XvJ/lghqDpyiyhqm6d5NDW2tvG9b45zr9/kjcm+f0kVyT51yS3X7TurTI84T69qh6b4THwG+PiszNxht6i9XarGk34+4n7tmWKfn47Qzi1sM6DJxeusQ5bM4SjZ473pzK8EFzRjI/ps1prl4/9+EKS947zz03yE+P0g5LcY+JvdJtxbO6f5JSqOjJDuHezie2u9nwzld30cXKami62o+N3arthDc9KcnJV3SzDm4/blmizltcrU9sNa5ZMN+4OS/Lmqjokyc2TfDFJWmvXVtUvZwhO/izD88pdJjc+w5rM6jllYUydm+T8idpelOF13f0z/CN91riv/ZJ8ZVzn58ezK+6U5FYZXqtcPy777Pj7M0nuuFRfV7KJ6/kfi9pPvt5ceC19k+fTPWxcnpPkDVX19iRvX6bNwuPGCRlOVjhhcYM9rGbJdGPvmVX1qLHd4Rne7P6P1trrqup/ZDgD7ajV7qAwaru9knyttbZq0RapDC8op/XsJF9O8gPZnlwvWLydabY77f4X2lw37jc1jNSbT7T51sT09Vn6n9XdtU7JTe/ffuP0DTXJ8I70Sussdcwst//ltltJ/k+GM7+eUVUvzHDtcq/armbabe3ofqetc7L8Y9PuVqNk+TG0keNu4Vlo8bj7amvt9jddZUXrrd1y92Wpba729/3+DE+Ad1phf7WD8yfrtVeGwOX3W2t/XcPlAfcdly33d1jY9u5Uo8X7XW5cJjceQ//dxre9snQ9VqrDZJ0zse5eSb7ZWtvvpqusaFc5pr8zcfs7ufH9um9r7b8mV6yqP0/ywdbao2q4XOCMZba70nhbze7+OLlcTZdbZz21Ws5uVcPW2oeq6scynAlxalW9vLX2N2vd3hrtVjUbTTPu/jzJn7bWTqvhkuUXTqzz/WPbO2Y4Q2qxWdVk1s8pk7VcuL1Phvt1SmvttydXqqojMpz59MNJPpXhLPKbZXsYtfC7JblOPZd9XNzVX0vPoo4/k+EN3Eck+b2quucy7Va7X3tSzSb3u+TYGx8LH5Thdc61NXzUyC2SpKpumSHET4Zg+esr7WiP+cyo1bTW/jPJF8ckLzX4gSnW+1qSq8d33ZPk8aussn+Sy1tr30nyhAyBz4Kja7huda8kj8nqZ08kw+l4P19VB439PnCc/9Ekx030aWFbF2f7ZxQdkxu/K7ucr2cMrXbjOq3k4myvybErtFvKh5I8qqr2G9/1fvgU2/1ghjPaFo6/g5LcvGNtv57k1sss+0huPG5W8t4kvzQ+6KSqDhzf1b+qtn9+xxOSLLwbcHF2vM7fGd/pTdJ1/O2sGq3k4myvx6NXaLeUDyU5rqr2Ht+tnTxz4JJsP0tgcrv/lOT6idr94AbVbhpLHjcr/X2r6ugM7yb9YJLnji9el+rvfya5tKoeOa637zhGP5Th8SIZjrnvzfCO7JeS3C7DY0xlOC5/eGKTS71wudH42N1qtIqLkxxVw7X+h2c4i3Mqq9ThkgyXq2Xc7qHj/A8m2auq/ue47JZV9Yh17mspPY7pSe/NcGlikqSGs3yT4bnt38bpJ+2kfd3IJnucnIndrYZVdZckX2mt/WWS1ydZ+Ny1/554Dl3p9cq67W412wGTx+zxCzPHmv9GhktYHpjh4yNupGNNpjGr55RJpyc5tqpuP27/wLGOt8lwFvLVGZ6DH7jM+t9I8k31XJ89ZVzW8H/i4a21D2a4/O6AbA9HJh9DJh83lvz/ZE+p2Q7YP8lVYxB19yQ/MrHspUnekOFKhL9cbUPCqBt7
fJInV9WnM1wHfsyU6/1iklfV8MFk/7VK279IcnxVfTzDpWffmFj2sSQvyXCJ3BeTvG21HbfWzs/w2U7/PPb7T8dFz0zyizV8QPYTMlzzmQyD4ser6hMZriX9RlZ3UpJfyPgB5tkN67SKP07yazV8PeqyHxS/lNbaJ5O8Ocm2DJ8D9eEptntqhhc2j6nh6z1/NtuPxR61/WCGy0e2VdVjFi379QyXKJ2V4YFmWa21d2c4jXO+hsvPFr5W9PgkLx/H3lHZfrrrWup8foZTbCcvj9ptarSKFyX531X14Wx/129ab0tyQYbTZ1+d7YFfMlyP/vgMNZvc7osz3K+Tq+q/MjyZbUTtVrXKcXOTv29V7ZvhseuXWmuXZfgH4OSqqiztCRlOHz4nQzB/xww1OyfDO0Jvy3Aq9rWttUvG/fxGhifPM5L82Lj/O2Xp06n/McmjMjwmLjyh7241Ws5HMjyunpvhmP3kDq6/XB3OTPLVJI8dt3t5krTWrhjX+f/t3V+IVGUYx/HvDw00gyAh8KKSLoJCKUsRKkqJiiCKyOiioroLIutCgihCyIjqQoyK/tA/aq+KKO2ilEKyIBTK/FsEdVEURUQ3QQr6dHGONG2ruzuzntnV7weWnTk773ueeZiZM/uc933PuvZ1+Qf/Fg373ddYunhP91oNLE2zaOo+/j1+PgU8keRz/nuSZaqdKJ+TwzSTcrgC2JnkK5qTEBva7S8Bu5KMjPOZMlVmUs4mai3NtJdtNJ9hR2YWvELzvecQ8ED7HMZ6T3eRk3EN8ZjSG8M+mrWgNrfH5y3Agqr6mmZE1F6a74fbj9HN/tGxTnD3J1w+B3QyvC5nAW8l2U3z+lrfFoc20RRzdqY5eX4/zRpzC2gKo0dzMuRsoj6kGSG1i+b/iy8AklxJc0L3yaoaAQ4muftYHR1ZkE9Dlma425qqun7YsUxn0z1PaafcVZ+XfJf6keR1mgUPJ7J2mtSJNNPQPqiqRUMORZIkSdOMI6MkSZIkSZLUGUdGjSPJw8Atoza/XVWPj9PuWv5/FY8fquqmsR4/gTjm08y1Hu2qqhp9NYXOmafjZ9i5TbKYZmphrwNVtXwy/RxP5qh/w87dIJI8R3MVlF4bquq1Kd6POerYsHM+k9/TR2NOB2cOJ8+c/d+wczKI6XhMMZ9TFot5nPx+zdmgcViMkiRJkiRJUlecpidJkiRJkqTOWIySJEmSJElSZyxGSZIkSZIkqTMWoyRJkvqUZHWS/UlGhh2LJEnSTOEC5pIkSX1K8g1wXVX9MOxYJEmSZgpHRkmSJPUhyQvAucDGJA8neTXJjiRfJbmxfczCJNuSfNn+XHqM/lYk2ZrknSTfJBlJkvZvj7Z970nyUs/2rUnWJ/m0HaG1LMm7Sb5Lsq6n79uTbE+yM8mLSWYd3+xIkiQdncUoSZKkPlTVPcDPwEpgHvBJVS1r7z+dZB7wG3B1VV0M3Ao8M063S4AHgAtoCl2XtdufraplVbUImAtc39PmYFVdAbwAvA/cCywC7koyP8n57b4vq6qLgEPAbYM9e0mSpP7NHnYAkiRJJ4BrgBuSrGnvzwHOpilWPZvkSBHovHH62V5VPwEk2QksBD4DViZ5EDgVOAPYC2xq22xsf+8G9lbVL23774GzgMuBS4Ad7YCquTRFMkmSpKGwGCVJkjS4ADdX1bf/2ZisBX4FLqQZkf73OP0c6Ll9CJidZA7wPLC0qn5s+5wzRpvDo9ofpvmuF+CNqnpoMk9IkiTpeHGaniRJ0uA+Au7rWctpSbv9dOCXqjoM3AH0s1bTkcLT70lOA1ZNsv3HwKokZ7axnZHknD7ikCRJmhIWoyRJkgb3GHAKsCvJnvY+NCOa7kzyBc0Uvb8m23FV/Qm8TDMN7z1gxyTb7wMeATYn2QVsARZMNg5JkqSpkqoadgySJEmSJEk6STgySpIkSZIkSZ1xAXNJkqQOJVkMvDlq84GqWj6MeCRJkrrmND1JkiRJkiR1xml6kiRJ
kiRJ6ozFKEmSJEmSJHXGYpQkSZIkSZI6YzFKkiRJkiRJnfkHzsY0rysG+q0AAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 1440x720 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "plt.figure(figsize=[20, 10,])\n",
    "sns.barplot(x = feature_importance['fea_name'], y = feature_importance['fea_imp'])\n",
    "#sns.barplot(x=\"fea_name\",y=\"fea_imp\",data=feature_importance)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3.3.5 模型测试"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Average the predictions of every model in `models` on the test features.\n",
    "# The divisor is the actual number of models rather than a hard-coded 5, so the\n",
    "# result stays a true mean even if `models` does not hold exactly five entries.\n",
    "pred_res = 0\n",
    "fold = len(models)  # assumes `models` is a sized collection such as a list -- confirm\n",
    "test_features = test_submit[train_features]  # loop-invariant: select the columns once\n",
    "for model in models:\n",
    "    pred_res += model.predict(test_features) / fold"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Add the columns prob0 .. prob7 to test_submit, each filled with 0.\n",
    "for class_idx in range(8):\n",
    "    test_submit['prob' + str(class_idx)] = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Names of the eight probability columns, built once and reused below.\n",
    "prob_cols = ['prob' + str(i) for i in range(8)]\n",
    "\n",
    "# Store the averaged predictions in the probability columns.\n",
    "test_submit[prob_cols] = pred_res\n",
    "\n",
    "# Write file_id plus the probability columns to baseline.csv.\n",
    "# index=False (the documented boolean) keeps the row index out of the file.\n",
    "test_submit[['file_id'] + prob_cols].to_csv('baseline.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.1"
  },
  "latex_envs": {
   "LaTeX_envs_menu_present": true,
   "autoclose": false,
   "autocomplete": true,
   "bibliofile": "biblio.bib",
   "cite_by": "apalike",
   "current_citInitial": 1,
   "eqLabelWithNumbers": true,
   "eqNumInitial": 1,
   "hotkeys": {
    "equation": "Ctrl-E",
    "itemize": "Ctrl-I"
   },
   "labels_anchors": false,
   "latex_user_defs": false,
   "report_style_numbering": false,
   "user_envs_cfg": false
  },
  "toc": {
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": true,
   "toc_position": {
    "height": "calc(100% - 180px)",
    "left": "10px",
    "top": "150px",
    "width": "384px"
   },
   "toc_section_display": true,
   "toc_window_display": true
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
